In [1]:
import pandas as pd
import numpy as np 
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Using graph_objects
!pip install plotly
import plotly.graph_objects as go
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
Requirement already satisfied: plotly in c:\users\azin\anaconda3\lib\site-packages (5.6.0)
Requirement already satisfied: six in c:\users\azin\anaconda3\lib\site-packages (from plotly) (1.16.0)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\azin\anaconda3\lib\site-packages (from plotly) (8.0.1)
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Azin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Azin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Out[1]:
True
In [2]:
# Data loading
filename = 'Youtube02-KatyPerry.csv'
data = pd.read_csv(filename)
In [3]:
data.head(10)
Out[3]:
COMMENT_ID AUTHOR DATE CONTENT CLASS
0 z12pgdhovmrktzm3i23es5d5junftft3f lekanaVEVO1 2014-07-22T15:27:50 i love this so much. AND also I Generate Free ... 1
1 z13yx345uxepetggz04ci5rjcxeohzlrtf4 Pyunghee 2014-07-27T01:57:16 http://www.billboard.com/articles/columns/pop-... 1
2 z12lsjvi3wa5x1vwh04cibeaqnzrevxajw00k Erica Ross 2014-07-27T02:51:43 Hey guys! Please join me in my fight to help a... 1
3 z13jcjuovxbwfr0ge04cev2ipsjdfdurwck Aviel Haimov 2014-08-01T12:27:48 http://psnboss.com/?ref=2tGgp3pV6L this is the... 1
4 z13qybua2yfydzxzj04cgfpqdt2syfx53ms0k John Bello 2014-08-01T21:04:03 Hey everyone. Watch this trailer!!!!!!!! http... 1
5 z12rw1o4zvidhdthz04cixxjssq5wzsrlpk0k Nere Overstylish 2014-08-02T23:12:49 check out my rapping hope you guys like it ht... 1
6 z13xizvwrki2hf2ev22txvrp2ovcyf3zq04 Jayki L 2014-08-03T21:20:41 Subscribe pleaaaase to my instagram account , ... 1
7 z12ogvgbmre3eloah04ccjbpsmusxdxbwc0 djh3mi 2014-08-06T21:14:15 hey guys!! visit my channel pleaase (i'm searc... 1
8 z125efjyoyaxwhzhz04cgh4oaontcvvdc Manuel Ortiz 2014-08-07T17:46:23 Nice! http://www.barnesandnoble.com/s/BDP?csrf... 1
9 z12is34ysrzoy3uwl04cctlxmrekjfuhvig Mike Bennett 2014-08-07T19:40:18 http://www.twitch.tv/daconnormc 1
In [4]:
# Using graph_objects
fig = go.Figure([go.Scatter(x=data['DATE'])])
fig.show()
In [5]:
print("Size of the data set is :",len(data) ,"&&","Number of unique values in COMMENT_ID column :",data["COMMENT_ID"].nunique())
Size of the data set is : 350 && Number of unique values in COMMENT_ID column : 350
In [6]:
print("Size of the data set is :",len(data) ,"&&","Number of unique values in AUTHOR column :",data["AUTHOR"].nunique())
Size of the data set is : 350 && Number of unique values in AUTHOR column : 342
In [7]:
print("Size of the data set is :",len(data) ,"&&","Number of unique values in DATE column :",data["DATE"].nunique())
Size of the data set is : 350 && Number of unique values in DATE column : 350
In [8]:
print("Size of the data set is :",len(data) ,"&&","Number of unique values in CONTENT column :",data["CONTENT"].nunique())
Size of the data set is : 350 && Number of unique values in CONTENT column : 348
In [9]:
print("Size of the data set is :",len(data) ,"&&","Number of unique values in CLASS column :",data["CLASS"].nunique())
Size of the data set is : 350 && Number of unique values in CLASS column : 2
In [10]:
print(pd.unique(data["CLASS"]))
[1 0]
In [11]:
print(data["CLASS"].value_counts())
1    175
0    175
Name: CLASS, dtype: int64
In [12]:
data = data.drop(columns=['COMMENT_ID', 'AUTHOR','DATE'])
In [13]:
data
Out[13]:
CONTENT CLASS
0 i love this so much. AND also I Generate Free ... 1
1 http://www.billboard.com/articles/columns/pop-... 1
2 Hey guys! Please join me in my fight to help a... 1
3 http://psnboss.com/?ref=2tGgp3pV6L this is the... 1
4 Hey everyone. Watch this trailer!!!!!!!! http... 1
... ... ...
345 This song means so much to me thank you soooo... 0
346 <3 0
347 KATY PERRY, I AM THE "DÉCIO CABELO", "DECIO HA... 1
348 Honestly speaking except taylor swift and adel... 0
349 who is going to reach the billion first : katy... 0

350 rows × 2 columns

In [14]:
data_X = data["CONTENT"]
data_X
Out[14]:
0      i love this so much. AND also I Generate Free ...
1      http://www.billboard.com/articles/columns/pop-...
2      Hey guys! Please join me in my fight to help a...
3      http://psnboss.com/?ref=2tGgp3pV6L this is the...
4      Hey everyone. Watch this trailer!!!!!!!!  http...
                             ...                        
345    This song means so much to me thank you  soooo...
346                                               <3
347    KATY PERRY, I AM THE "DÉCIO CABELO", "DECIO HA...
348    Honestly speaking except taylor swift and adel...
349    who is going to reach the billion first : katy...
Name: CONTENT, Length: 350, dtype: object
In [15]:
#library that contains punctuation
import string
string.punctuation

#defining the function to remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree
#storing the puntuation free text
data['CONTENT']= data['CONTENT'].apply(lambda x:remove_punctuation(x))


data['CONTENT']= data['CONTENT'].apply(lambda x: x.lower())




#defining function for tokenization
import re
def tokenization(text):
    tokens = re.split('W+',text)
    return tokens
#applying function to the column
data['CONTENT']= data['CONTENT'].apply(lambda x: tokenization(x))




#importing nlp library
import nltk
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')

#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

#applying the function
data['CONTENT']= data['CONTENT'].apply(lambda x:remove_stopwords(x))



#importing the Stemming function from nltk library
from nltk.stem.porter import PorterStemmer
#defining the object for stemming
porter_stemmer = PorterStemmer()

#defining a function for stemming
def stemming(text):
    stem_text = [porter_stemmer.stem(word) for word in text]
    return stem_text
data['CONTENT']=data['CONTENT'].apply(lambda x: stemming(x))




from nltk.stem import WordNetLemmatizer
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text
data['CONTENT']=data['CONTENT'].apply(lambda x:lemmatizer(x))
In [16]:
data['CONTENT'] = data['CONTENT'].astype(str)
In [17]:
print(data["CONTENT"])
0      ['i love this so much and also i generate free...
1      ['httpwwwbillboardcomarticlescolumnspopshop617...
2      ['hey guys please join me in my fight to help ...
3      ['httppsnbosscomref2tggp3pv6l this is the song...
4      ['hey everyone watch this trailer  httpbelieve...
                             ...                        
345    ['this song means so much to me thank you  soo...
346                                        ['lt3\ufeff']
347    ['katy perry i am the décio cabelo decio hair ...
348    ['honestly speaking except taylor swift and ad...
349    ['who is going to reach the billion first  kat...
Name: CONTENT, Length: 350, dtype: object
In [18]:
data
Out[18]:
CONTENT CLASS
0 ['i love this so much and also i generate free... 1
1 ['httpwwwbillboardcomarticlescolumnspopshop617... 1
2 ['hey guys please join me in my fight to help ... 1
3 ['httppsnbosscomref2tggp3pv6l this is the song... 1
4 ['hey everyone watch this trailer httpbelieve... 1
... ... ...
345 ['this song means so much to me thank you soo... 0
346 ['lt3\ufeff'] 0
347 ['katy perry i am the décio cabelo decio hair ... 1
348 ['honestly speaking except taylor swift and ad... 0
349 ['who is going to reach the billion first kat... 0

350 rows × 2 columns

In [19]:
new_data = data.sample(frac=1)
In [20]:
new_data
Out[20]:
CONTENT CLASS
116 ['httpswwwreverbnationcomslicknick313songs\ufe... 1
75 ['roaaaaarrrrrr 🐯🐯🐯\ufeff'] 0
136 ['httpthepiratebaysetorrent10626048theexpendab... 1
145 ['this is the best of the best video in world\... 0
324 ['hey yall its the real kevin hart shout out t... 1
... ... ...
43 ['httpwwwbubblewscomnews6401116vpssolutions\uf... 1
188 ['omg i love you katy parry your songs rock th... 0
293 ['damnnnnnnnn she is sexy oo\ufeff'] 0
41 ['even without make up she is still hot h... 1
96 ['this video is very inaccurate a tiger would ... 0

350 rows × 2 columns

In [21]:
train=new_data.sample(frac=0.75,random_state=10)
test=new_data.drop(train.index)
In [22]:
train
Out[22]:
CONTENT CLASS
100 ['maybe the best music video in the last 15 ye... 0
230 ['thank you katyperryvevo for your instagram l... 1
123 ['please check out my acoustic cover channel ... 1
200 ['i rekt ur mum last nite cuz da haterz were 2... 1
73 ['shes an old whore\ufeff'] 0
... ... ...
97 ['great video by a great artist in katy perry ... 1
166 ['how old is katy perry\ufeff'] 0
287 ['οh my god roar is the most liked video at v... 0
236 ['this video is great i love thisand like muc... 0
234 ['honestly i wanna see you be brave oh wait\uf... 0

262 rows × 2 columns

In [23]:
train["CONTENT"]
Out[23]:
100    ['maybe the best music video in the last 15 ye...
230    ['thank you katyperryvevo for your instagram l...
123    ['please check out my acoustic cover channel  ...
200    ['i rekt ur mum last nite cuz da haterz were 2...
73                           ['shes an old whore\ufeff']
                             ...                        
97     ['great video by a great artist in katy perry ...
166                      ['how old is katy perry\ufeff']
287    ['οh my god  roar is the most liked video at v...
236    ['this video is great  i love thisand like muc...
234    ['honestly i wanna see you be brave oh wait\uf...
Name: CONTENT, Length: 262, dtype: object
In [24]:
train["CLASS"]
Out[24]:
100    0
230    1
123    1
200    1
73     0
      ..
97     1
166    0
287    0
236    0
234    0
Name: CLASS, Length: 262, dtype: int64
In [25]:
# Build a count vectorizer and extract term counts 
#Text preprocessing, tokenizing and filtering of stopwords are all included in
#CountVectorizer, which builds a dictionary of features and transforms documents
# to feature vectors:
count_vectorizer = CountVectorizer()
train_x = count_vectorizer.fit_transform(train["CONTENT"])
print("\nDimensions of training data:", train_x.shape)
Dimensions of training data: (262, 1123)
In [26]:
#This downscaling is called tf–idf for “Term Frequency times Inverse Document Frequency”.
# Create the tf-idf transformer
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_x)
type(train_tfidf)
Out[26]:
scipy.sparse.csr.csr_matrix
In [27]:
print(train_tfidf)
  (0, 1106)	0.2990637735909765
  (0, 1032)	0.16481551301946937
  (0, 1009)	0.05086977803209711
  (0, 973)	0.12520477582118922
  (0, 963)	0.2520912354849456
  (0, 769)	0.2431776103230956
  (0, 678)	0.3865660878700029
  (0, 633)	0.2638035303723182
  (0, 576)	0.25245226744767046
  (0, 541)	0.2609437343524048
  (0, 527)	0.16303042536981674
  (0, 436)	0.22255169027387303
  (0, 323)	0.2990637735909765
  (0, 247)	0.2990637735909765
  (0, 114)	0.2171920242290121
  (0, 6)	0.2990637735909765
  (1, 1116)	0.2711619882676886
  (1, 1110)	0.19871572752637318
  (1, 1009)	0.0758193542508905
  (1, 957)	0.33170432615928636
  (1, 592)	0.3931885314953434
  (1, 568)	0.4457425031262238
  (1, 531)	0.3931885314953434
  (1, 327)	0.2542433883017948
  (1, 93)	0.4457425031262238
  :	:
  (259, 189)	0.2638738551741603
  (259, 95)	0.24567492397446855
  (259, 83)	0.1798597233643821
  (259, 71)	0.2638738551741603
  (259, 45)	0.2638738551741603
  (260, 1032)	0.28440707951283417
  (260, 1009)	0.08778133041315
  (260, 974)	0.5160670429430365
  (260, 973)	0.21605444766693227
  (260, 750)	0.2599412462401278
  (260, 676)	0.339196175146362
  (260, 612)	0.2754724591932725
  (260, 590)	0.2813267167738476
  (260, 564)	0.23350984060138022
  (260, 541)	0.22514338621624005
  (260, 378)	0.3943762795796847
  (261, 1110)	0.17976295224597985
  (261, 1051)	0.4032292224723303
  (261, 1050)	0.4032292224723303
  (261, 1009)	0.06858798308108237
  (261, 839)	0.3278776214044928
  (261, 718)	0.4032292224723303
  (261, 430)	0.3556876553793848
  (261, 137)	0.4032292224723303
  (261, 101)	0.2803360543115473
In [28]:
train_tfidf
Out[28]:
<262x1123 sparse matrix of type '<class 'numpy.float64'>'
	with 3413 stored elements in Compressed Sparse Row format>
In [29]:
from sklearn.naive_bayes import MultinomialNB
In [30]:
# Train a Multinomial Naive Bayes classifier
classifier = MultinomialNB().fit(train_tfidf, train["CLASS"])
In [31]:
from sklearn.model_selection import cross_val_score
In [32]:
#score of the accuracy generated with the training data
scores = cross_val_score(classifier,train_tfidf,train["CLASS"],cv=5) 
print(scores)
[0.86792453 0.90566038 0.88461538 0.88461538 0.82692308]
In [33]:
scores.mean()*100
Out[33]:
87.39477503628447
In [34]:
test_x = test["CONTENT"]
test_x
test_y = test["CLASS"]
In [35]:
# Transform the testing feature data using count vectorizer
test_tc = count_vectorizer.transform(test_x)
type(test_tc)
Out[35]:
scipy.sparse.csr.csr_matrix
In [36]:
# Transform vectorized data using tfidf transformer
test_tfidf = tfidf.transform(test_tc) 
type(test_tfidf)
Out[36]:
scipy.sparse.csr.csr_matrix
In [37]:
# Predict the output categories, fitting the transformed testing feature
y_pred = classifier.predict(test_tfidf) 

#Accuracy generated with the testing data
print(accuracy_score(test_y, y_pred)) 
print(confusion_matrix(test_y, y_pred)) 
print(classification_report(test_y, y_pred))
0.8863636363636364
[[43  4]
 [ 6 35]]
              precision    recall  f1-score   support

           0       0.88      0.91      0.90        47
           1       0.90      0.85      0.88        41

    accuracy                           0.89        88
   macro avg       0.89      0.88      0.89        88
weighted avg       0.89      0.89      0.89        88